#!/usr/bin/env python


"""
download TreasuryDirect data
download and clean additional Treasury data
download TreasuryDirect PDFs, convert to txt files and parse
create combined dataset with intraday auction result release times
from combined sources (treasury direct data, pdfs, and bloomberg articles)
cleaned and merged datasets stored in data/input/TreasuryDirect/treasury_direct.h5

note: requires pdftotext utility (from poppler)
"""

import numpy as np
import pandas as pd

import datetime
import requests
from io import StringIO

import urllib
from bs4 import BeautifulSoup

import os
import os.path
import glob
import re
import shutil

# main directory
CODE_DIR = os.path.dirname(os.path.realpath(__file__))
TREASURYDIRECT_DIR = os.path.realpath(os.path.join(CODE_DIR, '../data/input/TreasuryDirect'))
BLOOMBERG_DIR = os.path.realpath(os.path.join(CODE_DIR, '../data/input/Bloomberg'))
# HDF file for auction data
TREASURYDIRECT_HDF = os.path.join( TREASURYDIRECT_DIR, 'treasury_direct.h5')

# cleaned data directory
DATA_DIR = os.path.join(CODE_DIR, '../data/input')


################################################################################
# read/write to HDf file

def save_treasurydirect(df, hdf_key, **kwargs):
    """save cleaned/collapsed treasurydirect data to HDF file"""
    df.to_hdf(TREASURYDIRECT_HDF, hdf_key, **kwargs)
    return

def read_treasurydirect(hdf_key, **kwargs):
    """read cleaned/collapsed treasurydirect data from HDF file"""
    df = pd.read_hdf(TREASURYDIRECT_HDF, key=hdf_key, **kwargs)
    return df

################################################################################

# code to download and clean raw TreasuryDirect data from website
def read_raw_treasurydirect(start_date=None, end_date=None):
    """download raw TreasuryDirect auction data"""
    # data in JSON format
    url = 'http://www.treasurydirect.gov/TA_WS/securities/search?format=json'
    # optinally: subset of data
    if start_date is not None:
        url += '&startDate=%s' % start_date
    if end_date is not None:
        url += '&endDate=%s' % end_date
    # send request and get data
    print("Downloading TreasuryDirect data...", flush=True)
    headers = {"Connection": "keep-alive"}
    req = requests.get(url, headers=headers)
    data = StringIO(req.text)
    print("Converting to DataFrame", flush=True)
    df = pd.read_json(data)
    return df


def clean_treasurydirect(df):
    """clean raw TreasuryDirect data"""
    # Convert numeric and datetime variables
    numvars = ['accruedInterestPer100','accruedInterestPer1000',
        'adjustedAccruedInterestPer1000','adjustedPrice',
        'allocationPercentage','allocationPercentageDecimals',
        'auctionDateYear','averageMedianDiscountMargin',
        'averageMedianDiscountRate','averageMedianInvestmentRate',
        'averageMedianPrice','averageMedianYield',
        'bidToCoverRatio','competitiveAccepted',
        'competitiveBidDecimals','competitiveTendered',
        'currentlyOutstanding','directBidderAccepted',
        'directBidderTendered','estimatedAmountOfPubliclyHeldMaturingSecuritiesByType',
        'fimaNoncompetitiveAccepted','fimaNoncompetitiveTendered',
        'frnIndexDeterminationRate','highDiscountMargin',
        'highDiscountRate','highInvestmentRate',
        'highPrice','highYield',
        'indexRatioOnIssueDate','indirectBidderAccepted',
        'indirectBidderTendered','interestRate',
        'lowDiscountMargin','lowDiscountRate',
        'lowInvestmentRate','lowPrice',
        'lowYield','maximumCompetitiveAward',
        'maximumNoncompetitiveAward','maximumSingleBid',
        'minimumBidAmount','minimumStripAmount',
        'minimumToIssue','multiplesToBid',
        'multiplesToIssue','nlpExclusionAmount',
        'nlpReportingThreshold','noncompetitiveAccepted',
        'offeringAmount','pricePer100',
        'primaryDealerAccepted','primaryDealerTendered',
        'refCpiOnDatedDate','refCpiOnIssueDate',
        'somaAccepted','somaHoldings',
        'somaTendered','spread',
        'standardInterestPaymentPer1000','tiinConversionFactorPer1000',
        'totalAccepted','totalTendered',
        'treasuryRetailAccepted','unadjustedAccruedInterestPer1000',
        'unadjustedPrice']
    for numvar in numvars:
        df[numvar] = pd.to_numeric(df[numvar])
    # Fix date variables - to datetime
    datevars = [u'announcementDate',u'auctionDate',u'backDatedDate',
        u'callDate',u'calledDate',u'datedDate',u'firstInterestPaymentDate',
        u'frnIndexDeterminationDate',u'issueDate',u'maturingDate',u'maturityDate',
        u'originalDatedDate',u'originalIssueDate',u'updatedTimestamp']
    TD_fmt = "%Y-%m-%dT%H:%M:%S"
    for datevar in datevars:
        df[datevar] = pd.to_datetime(df[datevar],format=TD_fmt,errors='coerce')
    # Boolean variables (with missing)
    boolvars = ['backDated','callable','cashManagementBillCMB',
        'competitiveTendersAccepted','fimaIncluded','floatingRate',
        'noncompetitiveTendersAccepted','reopening','somaIncluded',
        'strippable','treasuryRetailTendersAccepted','tips']
    bool_dict = {'Yes':1,'No':0,'':np.nan}
    for boolvar in boolvars:
        df[boolvar] = df[boolvar].map(bool_dict)
        df[boolvar] = pd.to_numeric(df[boolvar])
    
    # For the rest of columns, convert any unicode objects to string
    types = df.apply(lambda x: pd.api.types.infer_dtype(x.values, skipna=True))
    for var in types[types=='unicode'].index:
        df[var] = df[var].astype(str)
    return df


def save_treasury_direct_data():
    """download and save all TreasuryDirect auction data"""
    df_raw = read_raw_treasurydirect()
    df = clean_treasurydirect(df_raw)
    # save to table format to avoid reading errors
    save_treasurydirect(df, 'auction_data', mode='a', format='table')
    return


########################################################################

# allocation data from treasury
def read_raw_treasury_allocation():
    """download raw additional Treasury auction allocation data"""
    # data from https://home.treasury.gov/data/investor-class-auction-allotments
    # note: data updated regularly; but old xls files should still be accessible
    coupon_xls = 'https://home.treasury.gov/system/files/276/' + \
        'April-22-2020-IC-Coupons.xls'
    bill_xls = 'https://home.treasury.gov/system/files/276/' + \
        'April-7-2020-IC-Bills.xls'
        
    hist_coupon_xls = 'https://home.treasury.gov/system/files/276/' + \
        '/Website-PDO-4-A-Coupons-Jan-2000-Sep%202009.xls'
    hist_bill_xls = 'https://home.treasury.gov/system/files/276/' + \
        'Website-IC-allotments-Bills-Aug-2001-Sep-2009.xls'
    
    # loop over each file and append
    df_all = pd.DataFrame()
    xls_urls = (coupon_xls, bill_xls, hist_coupon_xls, hist_bill_xls )
    # cleaning varies for each xls file
    # Dollar values in millions/billions
    rescales = (10**9, 10**9, 10**6, 10**6 )
    drop_cols = ( [1,2,4,5], [1,2,4,5], [1,2,3,5,6], [1,2,4,5] ) 
    for xls_url, drop_c, rescale in zip(xls_urls, drop_cols, rescales):
        print(xls_url)
        df = pd.read_excel(xls_url, skiprows=3, engine="xlrd")
        df.drop(df.columns[drop_c], axis=1, inplace=True)
        # rename and rescale numeric data
        df.columns = ['issueDate','cusip','somaAccepted','depositoryAccepted',
            'individualsAccepted','dealersAccepted','pensionsAccepted',
            'investmentFundsAccepted','foreignAccepted','otherAccepted']
        for var in df.columns:
            if var in ['issueDate','cusip']: continue
            df[var] *= rescale
        # Drop missing
        df = df[df.cusip.notnull()]
        # Fix dates
        df.issueDate = pd.to_datetime(df.issueDate,format="%Y-%m-%d %H:%M:%S")
        df_all = pd.concat([df_all, df])
    return df_all
    

def save_treasury_allocation():
    """download and save investor allocation data"""
    df = read_raw_treasury_allocation()
    save_treasurydirect(df, 'investor_allocations', mode='a', format='table')

########################################################################




########################################################################
# bloomberg timestamps
def save_bloomberg_auction_release_times():
    """release times, from Bloomberg articles"""
    # auction article times from bloomberg
    fname = os.path.join(BLOOMBERG_DIR, 'BloombergAuctionTimes.csv')
    df = pd.read_csv(fname)

    # convert to datetime
    auction_dates = df['auctionDate'].str.replace('T00:00:00', '')
    df['auctionDate'] = pd.to_datetime(auction_dates)

    news_times = df['bloombergResultsTimestamp'].str.replace('\[BN\]', '')
    df['bloombergResultsTimestamp'] = pd.to_datetime(news_times)
    save_treasurydirect(df, 'bloomberg_data', mode='a', format='table')
    return


########################################################################
# Extra data from PDFs
def get_auction_pdf_urls(pdf_type):
    """urls and pdfs for release data"""
    pdf_urls = []
    pdf_fnames = []
    output_path = os.path.join(TREASURYDIRECT_DIR, 'pdf/%s/' % pdf_type)
    # create directory if does not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Load TreasuryDirect data, keep only auctions with PDF
    df = read_treasurydirect('auction_data')
    df_sub = df.loc[df[pdf_type]!='']
    # Loop over PDFs and save
    urlbase = "https://www.treasurydirect.gov/instit/annceresult/press/preanre"
    for idx, row in df_sub.iterrows():
        # url year differs if announcement or auction
        if pdf_type=='pdfFilenameAnnouncement':
            url_year = row['announcementDate'].year
        elif pdf_type=='pdfFilenameCompetitiveResults':
            url_year = row['auctionDate'].year
        date = pdf_type
        pdfname = row[pdf_type]
        pdf_urls.append( ("%s/%d/%s" % (urlbase, url_year, pdfname) ) )
        pdf_fnames.append( os.path.join(output_path, 
            row['auctionDate'].strftime("%Y%m%d_") + pdfname) )
    return pdf_urls, pdf_fnames

def get_additional_stats_pdf_urls():
    """urls and pdfs for release data"""
    pdf_urls = []
    pdf_fnames = []
    output_path = os.path.join(TREASURYDIRECT_DIR, 'pdf/auctdata_stat/')
    # create directory if does not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # scrape pdf names from table
    url_ofstat = ( "https://www.treasurydirect.gov/auctions/announcements-data-results/" + 
        "additional-statistics-historical/" )
    print("Loading auctdata_stat html", flush=True)
    try:
        html = urllib.request.urlopen(url_ofstat).read()
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        print("Failed.", flush=True)
    # Stats for 2003-2008 saved in html tables
    tbls_all = soup.find_all('table')
    for tbl in tbls_all:
        # PDF names are stored in hrefs
        pdf_list = tbl.find_all("a",href=True)
        for r in pdf_list:
            pdfname = r['href']
            pdf_urls.append( "http://www.treasurydirect.gov" + pdfname )
            idx_s = pdfname.rfind('/')
            pdf_fnames.append( os.path.join(output_path, pdfname[idx_s+1:]) )
    return pdf_urls, pdf_fnames


def save_convert_pdf(url, fname, use_existing=True):
    """save pdf to file, converted to txt file"""
    print("Loading %s ..." % url, end='', flush=True)
    # check if file exists before downloading
    success = use_existing and os.path.isfile(fname)
    # multiple attempts
    i = 0
    while (not success) and (i<=5):
        i+=1
        try:
            urllib.request.urlretrieve(url, fname)
            success = True
        except Exception:
            pass
    if not success:
        print('failed', flush=True)
    else:
        # Convert to text
        print("convert to txt", flush=True)
        sys_call = "pdftotext -layout %s %s.txt" % (fname,fname)
        os.system( sys_call )
    return 


def save_convert_multiple_pdfs(pdf_urls, pdf_fnames, use_existing=True):
    """save and convert list of multiple pdfs"""
    for pdf_url,pdf_fname in zip(pdf_urls,pdf_fnames):
        save_convert_pdf(pdf_url, pdf_fname, use_existing=use_existing)
    return


def download_treasury_direct_pdfs(get_announcement=True, get_results=True, get_additional=True):
    """Download PDFs of announcement/competitive/additional results."""
    # Get pdf urls and filenames
    if get_announcement:
        # announcements
        print('Announcement', flush=True)
        pdf_urls, pdf_fnames = get_auction_pdf_urls('pdfFilenameAnnouncement')
        save_convert_multiple_pdfs(pdf_urls, pdf_fnames, use_existing=True)
    
    if get_results:
        # results
        print('Results', flush=True)
        pdf_urls, pdf_fnames = get_auction_pdf_urls('pdfFilenameCompetitiveResults')
        save_convert_multiple_pdfs(pdf_urls, pdf_fnames, use_existing=True)
    
    if get_additional:
        # additional stats
        print('Additional statistics', flush=True)
        pdf_urls, pdf_fnames = get_additional_stats_pdf_urls()
        save_convert_multiple_pdfs(pdf_urls, pdf_fnames, use_existing=True)
    return 



# check conversions were successful
def _check_pdf_txt_conversion(pdf_fnames):
    """check that all pdfs converted to non-empty txt files"""
    missing_files = []
    small_files = []
    for pdf_fname in pdf_fnames:
        txt_fname = pdf_fname + '.txt'
        if os.path.isfile(pdf_fname) and os.path.isfile(txt_fname):
            # check size (in bytes)
            if os.stat(txt_fname).st_size<200:
                small_files.append( pdf_fname )
        else:
            missing_files.append( pdf_fname )
    # move pdf files of empty txt files to directory for manual processing
    for f_path in small_files:
        # get filename without path
        _, fn = os.path.split(f_path)
        fc_path = os.path.join(TREASURYDIRECT_DIR, 'pdf/pdf_convert_manually/', fn)
        # copy to new location
        shutil.copyfile(f_path, fc_path)
        
    return missing_files, small_files

# check conversions were successful
def _check_all_treasury_direct_pdf_txt_conversion():
    """check that all PDFs have been converted to txt files"""
    print('Announcement', flush=True)
    _, pdf_fnames = get_auction_pdf_urls('pdfFilenameAnnouncement')
    missing_files, small_files = _check_pdf_txt_conversion(pdf_fnames)
    print('missing:')
    print(missing_files)
    
    # results
    print('Results', flush=True)
    _, pdf_fnames = get_auction_pdf_urls('pdfFilenameCompetitiveResults')
    missing_files, small_files = _check_pdf_txt_conversion(pdf_fnames)
    print('missing:')
    print(missing_files)
    
    # additional stats
    print('Additional statistics', flush=True)
    _, pdf_fnames = get_additional_stats_pdf_urls()
    missing_files, small_files = _check_pdf_txt_conversion(pdf_fnames)
    print('missing:')
    print(missing_files)
    return 

########################################################################

# Parsing functions -- announcements
def get_announcement_data():
    """Parse txt files for announcements."""
    # Functions
    def find_all(a_str, sub):
        # Find all function
        start = 0
        while True:
            start = a_str.find(sub, start)
            if start == -1: return
            yield start
            start += len(sub)
    
    def get_release_time(d):
        # Get time of announcement
        # All files contain some release info at top
        d = d.lower()
        idx = list(find_all(d,'for immediate release'))
        idx+= list(find_all(d,'for release when authorized'))
        if len(idx)==1:
            return ''
        # One typo (emargoed)
        idx = list(find_all(d,'argoed until'))
        idx_s = len('argoed until') + 1
        # time is listed after embargoed phrase
        d = d[idx[0]+idx_s:idx[0]+idx_s+20]
        d = d.replace(".", "")
        if d.find('noon')==-1: idx_e = d.find('m')+1
        elif d.find('m')==-1: idx_e = d.find('noon')+4
        else: idx_e = min([ d.find('m')+1, d.find('noon')+4])
        d = d[:idx_e].strip().upper()
        #d = ' '.join(d.split())
        return d
    
    def get_closing_time(d):
        # All files have no more than 1 'Receipt of Tenders'
        idx = list(find_all(d,'Receipt of Tenders'))
        if len(idx)==1:
            d = d[idx[0]:]
            # All files with 'Receipt of Tenders' have 1 or 2 subsequent
            # 'Competitive tenders'
            idx = list(find_all(d,'Competitive tenders'))
            if len(idx)==2:
                return -1
            d = d[idx[0]:]
            # All files with 1 'Competitive tenders' have 1 or 2 subsequent
            # 'Prior to '
            idx = list(find_all(d,'Prior to'))
            if len(idx)==2:
                return -1
            d = d[idx[0]:]
            idx_s = len('Prior to ')
            idx_e = d.find('.m.')+3
            d = d[idx_s:idx_e]
            d = d.replace('.','').upper()
            d = ' '.join(d.split())
        else:
            # More recent press releases -- all contain "Competitive Closing Time"
            idx = list(find_all(d,'Competitive Closing Time'))
            d = d[idx[0]:]
            idx_s = len('Competitive Closing Time ')
            #if d.find('noon')==-1: idx_e = d.find('.m.')+3
            #elif d.find('.m.')==-1: idx_e = d.find('noon')+4
            #else: idx_e = min([ d.find('.m.')+3, d.find('noon')+4])
            # times always followed by ET (eastern time)
            idx_e = d.find('ET')
            d = d[idx_s:idx_e]
            d = d.replace('.','').upper()
            d = ' '.join(d.split())
        return d
    
    pdf_dir = os.path.join(TREASURYDIRECT_DIR, "pdf/pdfFilenameAnnouncement/")
    fnames_pdf = sorted(glob.glob(pdf_dir + "*.pdf"))
    # Save files to manually code
    fnames_notxt = []
    fnames_multiple = []
    closing_times = []
    release_times = []
    for i,fname_pdf in enumerate(fnames_pdf):
        if i%100==0: print('.', end='', flush=True)
        # Original pdf name
        pdf_name = fname_pdf[fname_pdf.rfind("/")+1:]
        pdf_name = pdf_name[pdf_name.find("_")+1:]
        fname = fname_pdf + ".txt"
        with open(fname,"r") as f:
            d = f.read()
        if len(d.strip())==0:
            # Pdf did not convert correctly
            fnames_notxt.append(fname_pdf)
            continue
        # Release info:
        release_time = get_release_time(d)
        release_times.append([pdf_name,release_time])
        # Closing time info:
        closing_time = get_closing_time(d)
        if closing_time==-1:
            fnames_multiple.append(fname_pdf)
        else:
            closing_times.append([pdf_name,closing_time])
    print()
    return release_times, closing_times, fnames_notxt, fnames_multiple

def save_announcement_data():
    """Get announcement data (some work done by hand)."""
    # Missing txt files:
    release_times_notxt = [
        ['ofp012197.pdf', "12:00 NOON"],
        ['ofp040297.pdf', "2:30 PM"],
        ['ofp070297.pdf', "2:30 PM"],
        ['ofp100197.pdf', "2:30 PM"],
        ['ofp123197.pdf', "10:00 AM"],
        ['ofp040198.pdf', "2:30 PM"],
        ['ofp070198.pdf', "2:30 PM"],
        ['ofe20399.pdf', ''],
        ]
    closing_times_notxt = [
        ['ofp012197.pdf', "1:00 PM"],
        ['ofp040297.pdf', "1:00 PM"],
        ['ofp070297.pdf', "1:00 PM"],
        ['ofp100197.pdf', "1:00 PM"],
        ['ofp123197.pdf', "1:00 PM"],
        ['ofp040198.pdf', "1:00 PM"],
        ['ofp070198.pdf', "1:00 PM"],
        ['ofe20399.pdf', "1:00 PM"],
        ]
    # Multiple closing times files:
    closing_times_multiple = [
        ['ofao0898.pdf', "912795AX9", "1:00 PM"],
        ['ofao0898.pdf', "912795BH3", "1:00 PM"],
        ['ofao0898.pdf', "912795CC3", "11:30 AM"],
        ['ofan0598.pdf', "912795BA8",  "1:00 PM"],
        ['ofan0598.pdf', "912795BL4", "1:00 PM"],
        ['ofan0598.pdf', "912795CD1", "11:30 AM"],
        ['ofa110499.pdf', "912795DG3",  "11:30 AM"],
        ['ofa110499.pdf', "912795DV0",  "1:00 PM"],
        ['ofa110499.pdf', "912795EH0",  "1:00 PM"],
        ['ofc52401.pdf', "912795GL9",  "11:30 AM"],
        ['ofc52401.pdf', "912795KJ9",  "11:30 AM"],
        ['ofab122001.pdf', "912795JD4",  "11:30 AM"],
        ['ofab122001.pdf', "912795JM4",  "1:00 PM"],
        ['ofab122001.pdf', "912795KA8",  "1:00 PM"],
        ['ofa051903.pdf', "912795MS7",  "1:00 PM"],
        ['ofa051903.pdf', "912795NJ6",  "1:00 PM"],
        ['ofa051903.pdf', "912795NX5",  "1:00 PM"],
        ['ofa052703.pdf', "912795MT5",  "11:30 AM"],
        ['ofa052703.pdf', "912795NK3",  "1:00 PM"],
        ['ofa052703.pdf', "912795NY3",  "1:00 PM"],
        ['ofc32300.pdf', "912795DS7",  "1:00 PM"],
        ['ofc32300.pdf', "912795GX3",  "11:30 AM"],
        ['ofc22201.pdf', "912795GC9",  "11:30 AM"],
        ['ofc22201.pdf', "912795FX4",  "1:00 PM"],
        ]
    # Get data from txt files
    release_times,closing_times,fnames_notxt,fnames_multiple = get_announcement_data()
    # Combine with manually coded data and add headers
    header_closing = ["pdfFilenameAnnouncement", "closingTimeCompetitive"]
    header_closing_multiple = ["pdfFilenameAnnouncement", "cusip", 
        "closingTimeCompetitive"]
    header_release = ["pdfFilenameAnnouncement", "announcementTime"]
    df_release_times = pd.DataFrame( release_times+release_times_notxt,
        columns=header_release)
    df_closing_times = pd.DataFrame( closing_times_notxt+closing_times,
        columns=header_closing)
    df_closing_times_multiple = pd.DataFrame( closing_times_multiple,
        columns=header_closing_multiple)
    # Drop duplicates
    df_release_times.drop_duplicates(inplace=True)
    df_closing_times.drop_duplicates(inplace=True)
    df_closing_times_multiple.drop_duplicates(inplace=True)
    # Save
    save_treasurydirect(df_release_times, 'release_times', mode='a', format='table')
    save_treasurydirect(df_closing_times, 'closing_times', mode='a', format='table')
    save_treasurydirect(df_closing_times_multiple, 'closing_times_multiple', 
        mode='a', format='table')
    return


# Parsing functions -- closing
def _get_closing_data_missing_txt():
    """load and convert closing stats data from csv file (for missing txt files)"""
    # load
    csv_path = os.path.join(TREASURYDIRECT_DIR, 
        'pdf/pdf_convert_manually/closing_stats_missing_txt.csv')
    df = pd.read_csv(csv_path, engine='c')
    # convert dates
    df['auctionDate'] = pd.to_datetime(df['auctionDate'], format='%b %d %Y')
    # convert numeric data
    numvar_list = ['competitiveTendered', 'competitiveAccepted', 
        'noncompetitiveTendered', 'noncompetitiveAccepted',
        'ForeignOfficialAddOnTendered', 'ForeignOfficialAddOnAccepted',
        'ForeignOfficialInstTendered', 'ForeignOfficialInstAccepted',
        'ForeignOfficialRefundedTendered', 'ForeignOfficialRefundedAccepted', 
        'somaTendered', 'somaAccepted']
    for var in numvar_list:
        # indicator for thousands/millions
        idx_thousand = (df[var].notnull()) & (df[var].str.contains('TH'))
        idx_million = (df[var].notnull()) & (df[var].str.contains('MIL'))
        idx_null = df[var].isnull()
        # convert to numeric and scale up
        df[var] = df[var].str.replace('TH', '')
        df[var] = df[var].str.replace('MIL', '').str.strip().astype(float)
        df[var] = (10**3 * idx_thousand * df[var]) + (10**6 * idx_million * df[var])
    return df

def save_closing_data():
    """Parse txt files for competitive closing data."""
    # Functions
    def parse_lines(lines):
        # Parse data
        cusip = get_cusip(lines)
        amts = get_tendered_accepted(lines)
        # Get bid to cover ratio
        line = [s for s in lines if "bid-to-cover ratio" in s.lower() ][0]
        idx = [m.start() for m in re.finditer('=',line)][1]
        b2c = float(line[idx+2:idx+6])
        data = amts
        data.insert(0,cusip)
        data.append(b2c)
        return data
    def get_fixed_width_idx(lines):
        # Data is fixed width -- get indices
        idx = [i for (i,s) in enumerate(lines) if "Tender Type" in s ][0]
        idxs = [m.start()+3 for m in re.finditer('  --',lines[idx+1])]
        idxs_last = [m.start()+3 for m in re.finditer('--',lines[idx+1])][-1]
        idxs.append(idxs_last)
        return sorted(idxs)
    def get_cusip(lines):
        # Get CUSIP number for auction
        cusip_idx = [i for (i,s) in enumerate(lines)
            if 'CUSIP No:' in s.replace("Number:","No:") ]
        cusip_line = lines[cusip_idx[0]]
        cusip = cusip_line[cusip_line.find("912"):cusip_line.find("912")+9]
        return cusip
    def get_tendered_accepted(lines):
        # Get amount tendered and accepted by bidder
        # Fixed width indices
        idxs = get_fixed_width_idx(lines)
        bidder_types = ['Competitive','Noncompetitive','FIMA (noncompetitive)',
            'Foreign Official AddOn','Foreign Official Inst.',
            'Foreign Official Refunded','Federal Reserve']
        amts = list()
        for b in bidder_types:
            line = [s for s in lines if b in s]
            # (data is in thousands)
            if len(line)>0:
                line_str = line[0][idxs[1]:idxs[2]].replace("$","").replace(",","")
                tendered = 1e3*int(line_str.strip())
                line_str = line[0][idxs[2]:idxs[3]].replace("$","").replace(",","")
                accepted = 1e3*int(line_str.strip())
            else:
                (tendered,accepted) = (np.NaN, np.NaN)
            amts.extend([tendered,accepted])
        return amts
    # Get data
    fnames = glob.glob( os.path.join(TREASURYDIRECT_DIR, 
        "pdf/pdfFilenameCompetitiveResults/*.txt" ) )
    data_all = list()
    for i,fname in enumerate(fnames):
        if i%100==0: print('.', end='', flush=True)
        idx_s = len('/pdf/pdfFilenameCompetitiveResults/')
        idx_e = idx_s+8
        # Get date
        dt = datetime.datetime.strptime(
            fname[fname.find("/pdf/")+idx_s:fname.find("/pdf/")+idx_e],"%Y%m%d")
        # Read txt files with data, that are not already in TD data
        # TD data starts in 4/7/2008
        # TXT files start after 2/16/1999
        # no data for 9/11/2001
        if (dt>=datetime.datetime(2008,4,7) or 
            dt<=datetime.datetime(1999,2,16) or 
            dt==datetime.datetime(2001,9,11)):
            continue
        # Get lines of text
        with open(fname, "r") as f:
            lines = f.readlines()
        # Parse
        data = parse_lines(lines)
        data.insert(0,dt)
        data_all.append(data)
    print()
    
    # Convert to dataframe
    headers = [u'auctionDate',u'cusip',
               u'competitiveTendered',u'competitiveAccepted',
               u'noncompetitiveTendered',u'noncompetitiveAccepted',
               u'fimaNoncompetitiveTendered',u'fimaNoncompetitiveAccepted',
               u'ForeignOfficialAddOnTendered',u'ForeignOfficialAddOnAccepted',
               u'ForeignOfficialInstTendered',u'ForeignOfficialInstAccepted',
               u'ForeignOfficialRefundedTendered',
               u'ForeignOfficialRefundedAccepted',
               u'somaTendered',u'somaAccepted',
               u'bidToCoverRatio']
    df = pd.DataFrame(data_all,columns=headers)
    # combine with hand-collected stats from pdfs with missing txt files
    df_notxt = _get_closing_data_missing_txt()
    df = pd.concat([df, df_notxt])
    df.set_index(['auctionDate', 'cusip'], inplace=True)
    # Save
    save_treasurydirect(df, 'competitive_results', mode='a', format='table')
    return 


# Parsing functions -- supplemental
def save_supplement_data():
    """Additional statistics (previously downloaded and converted)."""
    # http://www.treasurydirect.gov/instit/annceresult/auctdata/auctdata_stat.htm
    # Functions
    def parse_lines(lines):
        # Parse data
        dt = get_date(lines)
        # If multiple auction results break up by CUSIP
        idx = [i for (i,s) in enumerate(lines) if "CUSIP" in s]
        idx.append(-1)
        data_all = list()
        for i in range(len(idx)-1):
            # Save CUSIP string
            cusip_str = lines[idx[i]].replace("CUSIP:","").strip()
            # replace any spaces within cusip
            cusip_str = cusip_str.replace(" ", "")
            lines_cusip = lines[idx[i]:idx[i+1]]
            amts = get_tendered_accepted(lines_cusip)
            # Combine into list of lists
            data = [dt,cusip_str]
            data.extend(amts)
            data_all.append(data)
        return data_all
    def get_date(lines):
        # Find the date in the file
        line = [s for s in lines if "Addendum" in s][0].strip()
        date_str = line[line.find('dated ')+6:]
        # Get rid of commas and spaces, to account for typos
        date_str = date_str.replace(",","")
        date_str = date_str.replace(" ","")
        dt = datetime.datetime.strptime(date_str,"%B%d%Y")
        return dt
    def get_tendered_accepted(lines_cusip):
        # Get amount tendered and accepted by bidder
        # Lines should only contain one set of results
        # Primary dealer
        pd_str = [s for s in lines_cusip if "Primary Dealer" in s][0]
        # Direct bidder
        db_str = [s for s in lines_cusip if "Direct Bidder" in s][0]
        # Indirect bidder
        ib_str = [s for s in lines_cusip if "Indirect Bidder" in s][0]
        # Total competitive (for comparison)
        tc_str = [s for s in lines_cusip if "Total Competitive" in s][0]
        amts_str = " ".join([pd_str,db_str,ib_str,tc_str])
        # Drop commas and find numbers -- correspond to tendered/accepted
        # (data is in thousands)
        amts = [1e3*int(s) for s in amts_str.replace(",","").split() if s.isdigit()]
        return amts
    
    # Get all txt files
    fnames = glob.glob( os.path.join(TREASURYDIRECT_DIR, 
        "pdf/auctdata_stat/*.txt" ) )
    data_all = list()
    for i,fname in enumerate(fnames):
        if i%100==0: print('.', end='', flush=True)
        # Get lines of text
        with open(fname, "r") as f:
            lines = f.readlines()
        # Parase
        data = parse_lines(lines)
        data_all.extend(data)
    print()
    # Convert to dataframe
    headers = [u'auctionDate',u'cusip',
               u'primaryDealerTendered',u'primaryDealerAccepted',
               u'directBidderTendered',u'directBidderAccepted',
               u'indirectBidderTendered',u'indirectBidderAccepted',
               u'competitiveTendered',u'competitiveAccepted']
    df = pd.DataFrame(data_all,columns=headers)
    df.set_index(['auctionDate', 'cusip'], inplace=True)
    save_treasurydirect(df, 'supplement', mode='a', format='table')
    return 


########################################################################
########################################################################

# combined close and release times from various sources
def save_auction_close_release_times():
    """construct combined closing and release times from auctions"""
    # dates and auction info
    df = read_treasurydirect('auction_data', 
        columns=['cusip','auctionDate', 'type',
                 'pdfFilenameAnnouncement','closingTimeCompetitive'])
    # bloomberg timestamps
    df_bloomberg = read_treasurydirect('bloomberg_data', 
        columns=['cusip','auctionDate','bloombergResultsTimestamp'])
    # combined closing times from multiple files
    df_closing_b = read_treasurydirect('closing_times')
    df_closing_c = read_treasurydirect('closing_times_multiple')
    
    # merge and update missing times
    df = df.merge(df_closing_b, on=['pdfFilenameAnnouncement'], 
                  how='left', suffixes=('', '_b') )
    df = df.merge(df_closing_c, on=['cusip','pdfFilenameAnnouncement'], 
                  how='left', suffixes=('', '_c') )
    
    idx_b = df['closingTimeCompetitive']==''
    df.loc[idx_b,'closingTimeCompetitive'] = df.loc[idx_b,'closingTimeCompetitive_b']
    
    idx_c = df['closingTimeCompetitive']==''
    df.loc[idx_c,'closingTimeCompetitive'] = df.loc[idx_c,'closingTimeCompetitive_c']
    # drop extra columns
    df.drop(['closingTimeCompetitive_b','closingTimeCompetitive_c', 
             'pdfFilenameAnnouncement'], axis=1, inplace=True)
    # merge bloomberg release times
    df = df.merge(df_bloomberg, on=['cusip','auctionDate'], how='left' )
    
    # closing time to datetime
    # note: subtracting '00:00:00' converts times to timedelta
    # add auctionDate to get combined correct datetime
    df['close_time'] = df['auctionDate'] + (
        pd.to_datetime(df['closingTimeCompetitive']) - pd.to_datetime('00:00:00')
    )
    # close time: fill in for 1pm auctions released after 1pm
    # other release times: can't distinguish between 11:00am and 11:30am
    idx_missing_1pm = df['close_time'].isnull() & \
        (df['bloombergResultsTimestamp'].dt.hour>=13)
    df.loc[idx_missing_1pm, 'close_time'] = \
        df.loc[idx_missing_1pm, 'auctionDate'] + pd.Timedelta(13, 'h')
    
    # release time (rounded down to nearest 10 mintutes)
    # if after 2005: released immediately
    df['release_time'] = pd.NaT
    idx_2005 = df['auctionDate'].dt.year>=2005
    df.loc[idx_2005, 'release_time'] = df.loc[idx_2005, 'close_time']
    # before 2005: use bloomberg times
    df.loc[~idx_2005, 'release_time'] = df.loc[~idx_2005, 'bloombergResultsTimestamp']
    
    # drop extra columns and save
    df.drop(['closingTimeCompetitive','bloombergResultsTimestamp'], axis=1, inplace=True)
    save_treasurydirect(df, 'close_release_times', mode='a', format='table')
    return 




####################################################################################

# treasurydirect/auction data
def auction_data_to_stata(td_keys=None):
    """read treasury auction data and convert to stata"""
    if td_keys is None:
        td_keys = ['auction_data', 'close_release_times', 'competitive_results',
            'investor_allocations', 'supplement']
    for key in td_keys:
        df = read_treasurydirect(key)
        dta_fname = 'td_%s.dta' % key
        dta_fpath = os.path.join(DATA_DIR, dta_fname)
        # whether to write index
        write_index = not( len(df.index.names)==1 and (df.index.names[0] is None) )
        df.to_stata(dta_fpath, write_index=write_index)
    return




def main():
    # default options
    scrape_data = False
    parse_pdfs = False
    
    if scrape_data:
        # download and clean TreasuryDirect raw data
        print('TreasuryDirect data', flush=True)
        save_treasury_direct_data()
    
        # download and save supplemental allocation data
        print('allocation data', flush=True)
        save_treasury_allocation()
    
        # download PDFs
        print('downloading pdfs and convert to txts...', flush=True)
        download_treasury_direct_pdfs()
    
    if parse_pdfs:
        # save Bloomberg article times
        print('Bloomberg times', flush=True)
        save_bloomberg_auction_release_times()
    
        # parse and save
        print('parsing announcement data...', flush=True)
        save_announcement_data()
        print('parsing closing data...', flush=True)
        save_closing_data()
        print('Parsing supplemental data...', flush=True)
        save_supplement_data()
    
        # combined times
        print('saving combined close/release times...', flush=True)
        save_auction_close_release_times()
    
    # create final datasets    
    print('creating cleaned auction dta files', flush=True)
    auction_data_to_stata()

    return 


if __name__ == '__main__':
    print('Main function starting', flush=True)
    main()
    print('Main function finished.', flush=True)






